{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读入评分数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  \\\n",
       "0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id source_system_tab  \\\n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "    source_screen_name      source_type  target  \n",
       "0              Explore  online-playlist       1  \n",
       "1  Local playlist more   local-playlist       1  \n",
       "2  Local playlist more   local-playlist       1  \n",
       "3  Local playlist more   local-playlist       1  \n",
       "4              Explore  online-playlist       1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the training interactions, pinning every column's dtype up front.\n",
    "dtype = {\n",
    "    \"msno\": object,\n",
    "    \"song_id\": object,\n",
    "    \"source_system_tab\": object,\n",
    "    \"source_screen_name\": object,\n",
    "    \"source_type\": object,\n",
    "    \"target\": int,\n",
    "}\n",
    "dpath = '../Data/'\n",
    "df_kkbox = pd.read_csv(dpath + 'train.csv', dtype=dtype)\n",
    "# Preview the first rows as the cell output.\n",
    "df_kkbox.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 7377418 entries, 0 to 7377417\n",
      "Data columns (total 6 columns):\n",
      "msno                  object\n",
      "song_id               object\n",
      "source_system_tab     object\n",
      "source_screen_name    object\n",
      "source_type           object\n",
      "target                int64\n",
      "dtypes: int64(1), object(5)\n",
      "memory usage: 337.7+ MB\n"
     ]
    }
   ],
   "source": [
    "# Summarise the frame: index range, per-column dtype and memory footprint.\n",
    "df_kkbox.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "7377418个样本，5个特征1个标签列"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 查看缺省值情况"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "msno 缺省值数： 0\n",
      "song_id 缺省值数： 0\n",
      "source_system_tab 缺省值数： 24849\n",
      "source_screen_name 缺省值数： 414804\n",
      "source_type 缺省值数： 21539\n",
      "target 缺省值数： 0\n"
     ]
    }
   ],
   "source": [
    "# Report how many values are missing in each column listed in `dtype`.\n",
    "# isna().sum() counts directly on the boolean mask, so no filtered copy of the\n",
    "# multi-million-row frame is built per column and no `== True` comparison is needed.\n",
    "for key in dtype:\n",
    "    print(key,'缺省值数：',int(df_kkbox[key].isna().sum()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "source_system_tab，source_screen_name和source_type各有24849,414804和21539个缺省值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 计算用户数、音乐数目"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of users = 30755\n",
      "Number of items = 359966\n"
     ]
    }
   ],
   "source": [
    "# Distinct users (msno) and distinct songs (song_id) over all rows of the frame.\n",
    "n_users = len(df_kkbox['msno'].unique())\n",
    "n_items = len(df_kkbox['song_id'].unique())\n",
    "print('Number of users = ' + str(n_users))\n",
    "print('Number of items = ' + str(n_items))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据集中共有30755个不同的用户和359966首不同的歌曲（统计的是全部样本，不只是被继续订阅的样本），用户数远少于歌曲数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-user and per-song counts of positive rows (target == 1).\n",
    "is_positive = df_kkbox['target'] == 1\n",
    "user_freq = df_kkbox.loc[is_positive, 'msno'].value_counts()\n",
    "item_freq = df_kkbox.loc[is_positive, 'song_id'].value_counts()\n",
    "# The mask is only needed to build the two counts above.\n",
    "del is_positive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "o+5RNlSWrzvrphgBNGIo1FLkGxBgyICns6qXj3nS7Pk=    3288\n",
       "KGXNZ/H3VxvET/+rGxlrAe7Gpz2eKMXyuSg3xh8Ij1M=    2818\n",
       "FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=    2791\n",
       "MXIMDXO0j3UpaT7FvOSGW6Y5zfhlh+xYjTqGoUdMzEE=    2069\n",
       "dU4RbzpIRRd/EkA9Xncpy9CglzDBZp7nKMfdnfr3Aj8=    1978\n",
       "4DlS5LiANEKlMTQD08dakyV1JWwPYI/n1wyjx4ImjPc=    1814\n",
       "JkQacE3rvmhh65R04eDLbu+M7MCkpzmHwMGrbZo0puc=    1806\n",
       "uCDJmSDTzA5PpnYGqKvNmFbwBIoS3jIXFbO5YfoCv/Y=    1794\n",
       "1wiu2UfN0BNhHfPTrDS3S/rQkX/vnwDoKH6ODTUdH9k=    1737\n",
       "cqjRBV/jWN2ujhc+z/4tz+Mj6xEfflAAt6qBXCqxKvw=    1735\n",
       "7xiBI5xU3P2/IiR9teB7jySfzWo96JGikGajteLd3Cs=    1715\n",
       "HVcWdf8CEo9s6qwt5V7TpoPYJfRlQSTA6b7kxEAImpc=    1710\n",
       "mDJCU+fKu/mbdk9l4SmPYiJwpeMyK44o9wmG1X3735A=    1607\n",
       "V39VxDdE9SgMQZnX58oQChm9W056pw2lC3TiifMfkKw=    1602\n",
       "LThaiVqGGnVTPmTcmwN/LLo4fVb5dzkduzd7s1SgzIA=    1578\n",
       "EZmIP7zKgqKwpEHnuDjRIkzCmcLNlHn4hkiglZ5LxK8=    1539\n",
       "00qmlWC3H/cjf2Ig1li5xIO0VNDRgBX0f9ilouDBEZs=    1533\n",
       "x+k/zTEqdgYq6O+r1iDjKRaQnaCS5118gzBe8c12r1Y=    1503\n",
       "K7obY5lUlZTd6a/byE9NG/e7Q/dlgHfppLbYk8OL7os=    1498\n",
       "saNgxsUQwx4IXBRgFF0w3uJN6KzvQBEv3dI7dDaz97g=    1488\n",
       "hYJpPvGod6vy09TnlXdQe3Q0vlxju5u5Ruf8V2XkTio=    1476\n",
       "xQp8uu2ljPWqokw6Eg+4TCB/GUC/EZwFMcapcleGiDw=    1469\n",
       "vBNHobQRQsofef7+9NZk4C6Y6T2PBXSGXUbn6c8UOcg=    1448\n",
       "T+FJrpYi77CWTUE5nO/vliojRaTPgt2bRUQHy5lhXrQ=    1406\n",
       "ocQB699RoI5sBlRUfv8tu++vqupI35/95l3U507jdpw=    1383\n",
       "MxCUQ6tAD7LU1yR0gwRck91ZKE88gY8kD/rX9Hi9srQ=    1381\n",
       "YU6fAgCFgPkaJ1YSW2838KzGTxmBCfgousTO8jwuHYM=    1362\n",
       "wv0o115CdKSsFlsV7VvHikjyxLG0LFc5qALhM8xkUZ4=    1353\n",
       "6aH1YAi5YwvZUHtpjEaMQwM0p4wP4XLrkWQSdXDF9L8=    1341\n",
       "akW+xn32y3X+ZisQokWwV8+caojTEpm3E8lV8E3yuBk=    1326\n",
       "                                                ... \n",
       "w6E69PqgScncJu/ma3y6nxZP08BjmsnWZVrBjcQZS1E=       1\n",
       "hZ4kF/3r+IOCcO+h0oPXypRkx17TF0LI0tq1qATx4/4=       1\n",
       "FJJyeVwJxvpwjuoRoOXlRxfgml1O6fWWrCqRi0UfQzI=       1\n",
       "tonuTpOf1pil7Dtlv33rW9jzwI64f/gStbe+YMkz+ps=       1\n",
       "d8Xlb6yL0uIhLslB62iZtI/yzeKuC0p4rN8kem5KyQQ=       1\n",
       "BVH7onzVzpV6ZeUxvbxgZNPTrVuzMEnWpztRAs69BI8=       1\n",
       "NzQRDqFhTiGmzW6DLRKGzj0gTi0o+aAhOyJorhfR9PE=       1\n",
       "QflMxT8G80qwpGK4TobvhcqEVNFvZ5ys3RJvoN2qzzY=       1\n",
       "pk7uJkSC0Wam9uN0zjoKwQKpneCkwD6zXVVwR934oa4=       1\n",
       "LigKORoHn7g04jWYrRErZqrY/fRp4zCiFJvBArOnb6M=       1\n",
       "EvdvMVDpNrMbkddaUvdR5aZJmeI7I4ramwnF7jKsb0g=       1\n",
       "NXT4XxGHOerIjHQTVJQA92KCbhQZaaG6IYz+JNjRCew=       1\n",
       "7gnmJdzCWNJ2ODUJHrJLAE8pg+f+Em6DsIjiqXQLfaU=       1\n",
       "b+xj5C5/msuSDjbFEiviRTKvNJoxlIDZILzI8CGpRIs=       1\n",
       "F/rTuqoIscXncotGm7eXwNUOZu9Oin6Z8TvojC5KnNg=       1\n",
       "9mWfxVQGCkQqlxptytZ8cT6DnUWVPOi7iePOSk0rOm0=       1\n",
       "cPC8QWs6qmvLjAEvmJqh6qBt8DGfzuoFYGKD92G7BE4=       1\n",
       "AzBBHJad/vfnkqukQGGyY2CLJHiT9MN7mNqZx73bGMM=       1\n",
       "yQvLsLNxfI2GnAhu8twA2Cfwk1LH7MiakQlNjmtO4Zw=       1\n",
       "wizayd4Nw8myCFq3+MTlqbSiDTmSfdAA1uuVrqbJt24=       1\n",
       "S7Snd9cjceaC3J9rNxZAU047aH7lMWmhtioxWyvlymc=       1\n",
       "tlNoVym1T3ADyfe3kwf/e08iSK150s1ft1c9KxUDFkg=       1\n",
       "gBTB2h84yd5/B0CzKDipuLVRlhEECaL1r/eD8/WWVBs=       1\n",
       "Om0+q4FTmkI5glWPV72uMfsxczsTCCoZ6l+FxgkoNa4=       1\n",
       "cggvIKhBJ0fb1alCDHmzMrzvPWxC7hred329XgUAjkE=       1\n",
       "ILXCAMpdQctx/e+2PDcElLEblUPAk53Bj+5uQXwT6kw=       1\n",
       "QNZw5fo9V7A1LUprvCHv42EzeONNvp4bODz23dNzFUo=       1\n",
       "7r4e17NC2HdSBggUqNkc/Oc3y+NunRHfzIDx/2zI6Ik=       1\n",
       "J2ITtKRJkq+HZ+ZsLcAJMF3McwbaLchK949YuJktYNo=       1\n",
       "cRh0n734kTqlzBrOM1J4zIBjvCgMOptse204wNpq5aQ=       1\n",
       "Name: msno, Length: 27113, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the per-user counts of positive rows (target == 1) built with value_counts().\n",
    "user_freq"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在至少有一次继续订阅的27113个用户中，最活跃的用户继续订阅了3288次，最不活跃的用户才1次（可能为游客，或者误点击，或纯粹不喜欢不愿继续订阅），用户活跃程度分化严重"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "reXuGcEWDDCnL0K3Th//3DFG4S1ACSpJMzA+CFipo1g=    10885\n",
       "T86YHdD4C9JSc274b1IlMkLuNdz4BQRB50fWWE7hx9g=    10556\n",
       "FynUyq0+drmIARmK1JZ/qcjNZ7DKkqTY6/0O0lTzNUI=     9808\n",
       "wBTWuHbjdjxnG1lQcbqnK4FddV24rUhuyrYLd9c/hmk=     9411\n",
       "PgRtmmESVNtWjoZHO5a1r21vIz9sVZmcJJpFCbRa1LI=     9004\n",
       "U9kojfZSKaiWOW94PKh1Riyv/zUWxmBRmv0XInQWLGw=     8787\n",
       "YN4T/yvvXtYrBVN8KTnieiQohHL3T9fnzUkbLWcgLro=     8780\n",
       "M9rAajz4dYuRhZ7jLvf9RRayVA3os61X/XXHEuW4giA=     8403\n",
       "43Qm2YzsP99P5wm37B1JIhezUcQ/1CDjYlQx6rBbz2U=     8112\n",
       "J4qKkLIoW7aYACuTupHLAPZYmRp08en1AEux+GSUzdw=     7903\n",
       "cy10N2j2sdY/X4BDUcMu2Iumfz7pV3tqE5iEaup2yGI=     7725\n",
       "750RprmFfLV0bymtDH88g24pLZGVi5VpBAI300P6UOA=     7608\n",
       "IKMFuL0f5Y8c63Hg9BXkeNJjE0z8yf3gMt/tOxF4QNE=     7224\n",
       "+SstqMwhQPBQFTPBhLKPT642IiBDXzZFwlzsLl4cGXo=     7061\n",
       "DLBDZhOoW7zd7GBV99bi92ZXYUS26lzV+jJKbHshP5c=     6901\n",
       "v/3onppBGoSpGsWb8iaCIO8eX5+iacbH5a4ZUhT7N54=     6879\n",
       "p/yR06j/RQ2J6yGCFL0K+1R06OeG+eXcwxRgOHDo/Tk=     6536\n",
       "Xpjwi8UAE2Vv9PZ6cZnhc58MCtl3cKZEO1sdAkqJ4mo=     6399\n",
       "OaEbZ6TJ1NePtNUeEgWsvFLeopkSln9WQu8PBR5B3+A=     6187\n",
       "BITuBuNyXQydJcjDL2BUnCu4/IXaJg5IPOuycc/4dtY=     6160\n",
       "BgqjNqzsyCpEGvxyUmktvHC8WO5+FQO/pQTaZ4broMU=     6140\n",
       "3VkD5ekIf5duJm1hmYTZlXjyl0zqV8wCzuAh3uocfCg=     6012\n",
       "8Ckw1wek5d6oEsNUoM4P5iag86TaEmyLwdtrckL0Re8=     6003\n",
       "n+pMhj/jpCnpiUcSDl4k3i9FJODDddEXmpE48/HczTI=     5787\n",
       "WL4ipO3Mx9pxd4FMs69ha6o9541+fLeOow67Qkrfnro=     5784\n",
       "/70HjygVDhHsKBoV8mmsBg/WduSgs4+Zg6GfzhUQbdk=     5588\n",
       "L6w2d0w84FjTvFr+BhMfgu7dZAsGiOqUGmvvxIG3gvQ=     5480\n",
       "fEAIgFRWmhXmo6m3ukQeqRksZCcO/7CjkqNckRHiVQo=     5460\n",
       "+Sm75wnBf/sjm/QMUAFx8N+Ae04kWCXGlgH50tTeM6c=     5412\n",
       "VkDBgh89umc9m6uAEfD6LXngetyGhln4vh/ArCGO0nY=     5361\n",
       "                                                ...  \n",
       "KNbfj+ufFJIS8t/KCOar7BW/udXrqy8R4dgyUj6JbIk=        1\n",
       "Pj90ebQUI/02e9plWbd0ln4Fb7Hd9aAThdUdcM0Qqho=        1\n",
       "f/jWotV6dRC7ivWbNO7rNKqmAyI483Rf4rWQjf9VH/g=        1\n",
       "eJ1s0jT1JrK2BZJISXN6HwWxAZNieQr5Wq+jsN76Xr0=        1\n",
       "V7rZuPTGXPI8O2YwhL92ltUExqZDOT7Vz21kBVZUOVc=        1\n",
       "/zohsDTG2wEHhmRSwuWsu8MJTpE8CwTKAUzckkYQQwY=        1\n",
       "PSoTiaMRDdqPM72Zed8aBb8qx0lZhF7r6k4rqsd/pWs=        1\n",
       "PMFF9BpX+iQmWqUzpILfuzBPzFXh/dQN+E/q/Ci8gAw=        1\n",
       "2oIFGTGd/idEOvA4gQ9Qz6VNz30NFc2moLlZ7T87040=        1\n",
       "hJONGmClbF8WXU4lNRNxaY/ReYD+C+mLDnFr9cN6EX0=        1\n",
       "pm8dW0MiCoYxxg+ias1q5B0TMEMVxgnH8kHnuVQ2POM=        1\n",
       "0wWIDzJ3X33rlfbUH4wPLecxT9nl8REBBjrdHL7D2eo=        1\n",
       "FXJbEx8P2l3seqPSaFbKwwha0Ej8auyV4udg3V0mZg0=        1\n",
       "K3r9wEcJap2VoJf6OoIjzjHKfOUJLWjCldLUjNjHl/0=        1\n",
       "dHUzXZX6BV93lYpwzNbtIM8/EkJp2NrcMjF9E0mBy/8=        1\n",
       "Jbmino3Nx9tfG3Ygu8qZNEYZ8nrmOPYXG+ytskPWbn0=        1\n",
       "NbK0MzWeOevrPP6mK+8i1+I3Wx01Z6KkupP5UfOcf9Y=        1\n",
       "gBLpiPtqrVu6XjF7eKxGgRitb2/RPAGXnnWlvVPPZl0=        1\n",
       "PIgW1OK8B+CGlbNOWm639HggekffldVNOc9e18iqmGs=        1\n",
       "8V2lQOPUY3O2IC8mvlaZqLwuNWkfN5Aedxym6w8A04I=        1\n",
       "C4+RxbtyoZLckFxQBexJf8AwDUWmgtZG+HtbhcoXvOg=        1\n",
       "93PMYfnLh9F8mcVVTyGY3C2aPLmgHxHMjE30JtarIDA=        1\n",
       "Dhp2t334qh8JBFftkbWImwTbgswmrRpBf+oYxzCqDmI=        1\n",
       "rfEIcfI8emWS0zI393VnKV56RUZi8Lm7Jy+UuSUmn9c=        1\n",
       "pjZcbqhAMhCi+gb+3+rT4cDEtoYHbvGSIAA7TYuGWag=        1\n",
       "4qUqS53u+svVMMCDnqsI1MeZMpMt7TzJtpPgIt1VTqI=        1\n",
       "/sVSh1mqQFOHf/Y5XlyvexFO+N2GcehhcwjF6GoPLSI=        1\n",
       "qTsNZ+c+Sm6CtpROOV5EB+gv/DGbKsAzcTCVsJ8Ek5I=        1\n",
       "YrlddWhx7cPJ9udNeuMgyuCYGMTJEi+5zrnyeXw3Ans=        1\n",
       "ln9DXIjJkJoIs7Pzkamcal46LLYFxwNYegloz/GPZ18=        1\n",
       "Name: song_id, Length: 223723, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the per-song counts of positive rows (target == 1) built with value_counts().\n",
    "item_freq"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "最热门的歌曲，被继续订阅了10885次，很多歌曲则小于等于1次，长尾效应很明显。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['explore', 'my library', 'search', 'discover', nan, 'radio',\n",
       "       'listen with', 'notification', 'settings'], dtype=object)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the distinct source_system_tab values (NaN is included when present).\n",
    "df_kkbox['source_system_tab'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "explore 被继续订阅的概率： 42.21460086097565\n",
      "my library 被继续订阅的概率： 61.965897094224\n",
      "search 被继续订阅的概率： 42.136194299246256\n",
      "discover 被继续订阅的概率： 41.57697228223262\n",
      "radio 被继续订阅的概率： 22.26615845152412\n",
      "listen with 被继续订阅的概率： 32.65807995628127\n",
      "notification 被继续订阅的概率： 37.80113177041229\n",
      "settings 被继续订阅的概率： 59.09090909090909\n",
      "nan 被继续订阅的概率： 51.036259004386494\n"
     ]
    }
   ],
   "source": [
    "# Positive rate (share of rows with target == 1, in percent) for each\n",
    "# source_system_tab value, followed by the rate for rows where it is missing.\n",
    "# Categories are read from the data (in order of first appearance) instead of a\n",
    "# hand-copied list, so the loop cannot drift from the dataset.\n",
    "for source_system_tab in df_kkbox['source_system_tab'].dropna().unique():\n",
    "    # Select only the target column; the other columns are not needed here.\n",
    "    data = df_kkbox.loc[df_kkbox['source_system_tab'] == source_system_tab, 'target']\n",
    "    ratio = int((data == 1).sum()) * 100.0 / len(data)\n",
    "    print(source_system_tab,'被继续订阅的概率：',ratio)\n",
    "\n",
    "data = df_kkbox.loc[df_kkbox['source_system_tab'].isna(), 'target']\n",
    "if len(data):  # skip instead of dividing by zero when nothing is missing\n",
    "    print('nan 被继续订阅的概率：',int((data == 1).sum()) * 100.0 / len(data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "source_system_tab: 触发事件的类型/tab，用于表示app的功能类型，与是否继续订阅有联系，但是无法作为R矩阵的评分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Explore', 'Local playlist more', nan, 'My library',\n",
       "       'Online playlist more', 'Album more', 'Discover Feature',\n",
       "       'Unknown', 'Discover Chart', 'Radio', 'Artist more', 'Search',\n",
       "       'Others profile more', 'Search Trends', 'Discover Genre',\n",
       "       'My library_Search', 'Search Home', 'Discover New',\n",
       "       'Self profile more', 'Concert', 'Payment'], dtype=object)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the distinct source_screen_name values (NaN is included when present).\n",
    "df_kkbox['source_screen_name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Explore 被继续订阅的概率： 44.81490696967184\n",
      "Local playlist more 被继续订阅的概率： 63.69833734072403\n",
      "My library 被继续订阅的概率： 65.71729402474335\n",
      "Online playlist more 被继续订阅的概率： 41.4939031690236\n",
      "Album more 被继续订阅的概率： 39.055255666942756\n",
      "Discover Feature 被继续订阅的概率： 36.41042227917755\n",
      "Unknown 被继续订阅的概率： 33.915451356839576\n",
      "Discover Chart 被继续订阅的概率： 51.70318920892267\n",
      "Radio 被继续订阅的概率： 21.72564161469607\n",
      "Artist more 被继续订阅的概率： 41.68578095226777\n",
      "Search 被继续订阅的概率： 47.17491884068653\n",
      "Others profile more 被继续订阅的概率： 31.262915334869547\n",
      "Search Trends 被继续订阅的概率： 37.69806338028169\n",
      "Discover Genre 被继续订阅的概率： 34.788691272718424\n",
      "My library_Search 被继续订阅的概率： 61.122306619128814\n",
      "Search Home 被继续订阅的概率： 35.3582554517134\n",
      "Discover New 被继续订阅的概率： 45.5531181447822\n",
      "Self profile more 被继续订阅的概率： 42.45283018867924\n",
      "Concert 被继续订阅的概率： 51.06382978723404\n",
      "Payment 被继续订阅的概率： 66.66666666666667\n",
      "nan 被继续订阅的概率： 46.97881409051022\n"
     ]
    }
   ],
   "source": [
    "# Positive rate (share of rows with target == 1, in percent) for each\n",
    "# source_screen_name value, followed by the rate for rows where it is missing.\n",
    "# The category list is derived from the data (in order of first appearance)\n",
    "# rather than hand-copied from the unique() output, so it cannot go stale.\n",
    "source_screen_name = df_kkbox['source_screen_name'].dropna().unique().tolist()\n",
    "\n",
    "for screen_name in source_screen_name:\n",
    "    # Select only the target column; the other columns are not needed here.\n",
    "    data = df_kkbox.loc[df_kkbox['source_screen_name'] == screen_name, 'target']\n",
    "    ratio = int((data == 1).sum()) * 100.0 / len(data)\n",
    "    print(screen_name,'被继续订阅的概率：',ratio)\n",
    "\n",
    "data = df_kkbox.loc[df_kkbox['source_screen_name'].isna(), 'target']\n",
    "if len(data):  # skip instead of dividing by zero when nothing is missing\n",
    "    print('nan 被继续订阅的概率：',int((data == 1).sum()) * 100.0 / len(data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "source_screen_name: 用户看到的布局的名字，与是否继续订阅有联系，但是无法作为R矩阵的评分标准"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['online-playlist', 'local-playlist', 'local-library',\n",
       "       'top-hits-for-artist', 'album', nan, 'song-based-playlist',\n",
       "       'radio', 'song', 'listen-with', 'artist', 'topic-article-playlist',\n",
       "       'my-daily-playlist'], dtype=object)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the distinct source_type values (NaN is included when present).\n",
    "df_kkbox['source_type'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "online-playlist 被继续订阅的概率： 42.49854161034674\n",
      "local-playlist 被继续订阅的概率： 65.77193393626511\n",
      "local-library 被继续订阅的概率： 63.20980950287853\n",
      "top-hits-for-artist 被继续订阅的概率： 41.85366867006284\n",
      "album 被继续订阅的概率： 39.34206777502179\n",
      "song-based-playlist 被继续订阅的概率： 38.04595135065811\n",
      "radio 被继续订阅的概率： 21.973509083871342\n",
      "song 被继续订阅的概率： 43.74596480904864\n",
      "listen-with 被继续订阅的概率： 31.965028365190157\n",
      "artist 被继续订阅的概率： 57.274522712310734\n",
      "topic-article-playlist 被继续订阅的概率： 49.42826514204038\n",
      "my-daily-playlist 被继续订阅的概率： 37.55656108597285\n",
      "nan 被继续订阅的概率： 52.52332977389851\n"
     ]
    }
   ],
   "source": [
    "# Positive rate (share of rows with target == 1, in percent) for each\n",
    "# source_type value, followed by the rate for rows where it is missing.\n",
    "# The category list is derived from the data (in order of first appearance)\n",
    "# rather than hand-copied from the unique() output, so it cannot go stale.\n",
    "source_type = df_kkbox['source_type'].dropna().unique().tolist()\n",
    "\n",
    "for src_type in source_type:\n",
    "    # Select only the target column; the other columns are not needed here.\n",
    "    data = df_kkbox.loc[df_kkbox['source_type'] == src_type, 'target']\n",
    "    ratio = int((data == 1).sum()) * 100.0 / len(data)\n",
    "    print(src_type,'被继续订阅的概率：',ratio)\n",
    "\n",
    "data = df_kkbox.loc[df_kkbox['source_type'].isna(), 'target']\n",
    "if len(data):  # skip instead of dividing by zero when nothing is missing\n",
    "    print('nan 被继续订阅的概率：',int((data == 1).sum()) * 100.0 / len(data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "source_type: 用户在app上播放音乐的入口的类型，与是否继续订阅有联系，但显然无法作为R矩阵的评分标准"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the distinct label values present in the target column.\n",
    "df_kkbox['target'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正样本数： 3714656\n",
      "负样本数： 3662762\n",
      "正负比例： 1.0141679967194155\n"
     ]
    }
   ],
   "source": [
    "# Class balance: count positive (target == 1) and negative (target == 0) rows\n",
    "# and print their ratio. Counting on the boolean mask avoids copying row subsets.\n",
    "t = int((df_kkbox['target'] == 1).sum())\n",
    "f = int((df_kkbox['target'] == 0).sum())\n",
    "print('正样本数：',t)\n",
    "# Bug fix: this line used to print t, so the negative count was reported as\n",
    "# equal to the positive count even though the ratio below is not 1.\n",
    "print('负样本数：',f)\n",
    "print('正负比例：',1.0*t/f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "正负样本基本平衡。可作为用户对item的喜爱程度，作为R矩阵的评分标准"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
